- Getting Data
In [ ]:
import yfinance as yf
import pandas as pd
import numpy as np
import time
symbols = { 'Crude_Oil':"CL=F", 'Brent_Oil':"BZ=F", 'Natural_Gas':"NG=F", 'RBOB_Gasoline':"RB=F", 'Heating_Oil':"HO=F", 'Energy_SPDR_ETF':"XLE", 'VanEck_Oil_ETF':"OIH", 'SPDR_S&P_Exploration_ETF':"XOP", 'ExxonMobil':"XOM", 'Chevron':"CVX", 'BPplc':"BP", 'Shellplc':"SHEL", 'TotalEnergies':"TTE", 'Gold':"GC=F", 'US_13week_Treasury':"^IRX", 'US_10year_Treasury':"^TNX", 'Volatility_Index':"^VIX" }
start_date = "2023-01-01"
end_date = "2023-12-31"
def fetch_and_save_data(symbols, start_date, end_date):
for name, symbol in symbols.items():
print(f"Fetching data for {name} ({symbol})...")
# Fetch historical data
data = yf.download(symbol, start=start_date, end=end_date)
# Check if data is returned
if data.empty:
print(f"No data found for {name} ({symbol}) in 2023.")
continue
# Remove missing data
data.dropna(inplace=True)
# Calculate Logarithmic Returns
data['Log Return'] = np.log(data['Close'] / data['Close'].shift(1))
# Calculate Simple Returns
data['Simple Return'] = data['Close'].pct_change()
# Calculate Rolling Volatility (Annualized)
data['Volatility'] = data['Log Return'].rolling(window=21).std() * np.sqrt(252)
# Save data to CSV
filename = f"{name}_2023.csv"
data.to_csv(filename)
print(f"Data for {name} saved to {filename}")
# Print summary statistics
print(data[['Log Return', 'Simple Return', 'Volatility']].describe())
# Sleep to respect API rate limits
time.sleep(3)
In [ ]:
if __name__ == "__main__":
print("Starting data fetch...")
fetch_and_save_data(symbols, start_date, end_date)
print(f"Data fetch completed.")
import requests
import pandas as pd
from datetime import datetime
Starting data fetch... Fetching data for Crude_Oil (CL=F)... YF.download() has changed argument auto_adjust default to True
[*********************100%***********************] 1 of 1 completed
Data for Crude_Oil saved to Crude_Oil_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean -0.000286 -0.000060 0.332574 std 0.021288 0.021232 0.071661 min -0.057785 -0.056147 0.155874 25% -0.014003 -0.013906 0.288260 50% 0.002026 0.002028 0.345937 75% 0.013395 0.013485 0.388037 max 0.060881 0.062773 0.459743
[*********************100%***********************] 1 of 1 completed
Fetching data for Brent_Oil (BZ=F)...
Data for Brent_Oil saved to Brent_Oil_2023.csv Price Log Return Simple Return Volatility Ticker count 250.000000 250.000000 230.000000 mean -0.000254 -0.000056 0.309385 std 0.019963 0.019898 0.076432 min -0.057844 -0.056203 0.131897 25% -0.011749 -0.011680 0.260908 50% 0.001553 0.001554 0.332548 75% 0.013642 0.013736 0.363194 max 0.062680 0.064686 0.447963 Fetching data for Natural_Gas (NG=F)...
[*********************100%***********************] 1 of 1 completed
Data for Natural_Gas saved to Natural_Gas_2023.csv Price Log Return Simple Return Volatility Ticker count 250.000000 250.000000 230.000000 mean -0.001846 -0.000907 0.666541 std 0.043472 0.043258 0.171532 min -0.156924 -0.145231 0.425533 25% -0.031833 -0.031332 0.535535 50% -0.000899 -0.000898 0.604309 75% 0.028986 0.029411 0.772925 max 0.108172 0.114239 1.057179 Fetching data for RBOB_Gasoline (RB=F)...
[*********************100%***********************] 1 of 1 completed
Data for RBOB_Gasoline saved to RBOB_Gasoline_2023.csv Price Log Return Simple Return Volatility Ticker count 250.000000 250.000000 230.000000 mean -0.000464 -0.000211 0.356803 std 0.022547 0.022495 0.054429 min -0.071156 -0.068684 0.231217 25% -0.014693 -0.014585 0.323705 50% 0.002527 0.002530 0.357403 75% 0.015393 0.015512 0.388705 max 0.094215 0.098796 0.488964
[*********************100%***********************] 1 of 1 completed
Fetching data for Heating_Oil (HO=F)... Data for Heating_Oil saved to Heating_Oil_2023.csv Price Log Return Simple Return Volatility Ticker count 250.000000 250.000000 230.000000 mean -0.000759 -0.000509 0.353132 std 0.022432 0.022357 0.065057 min -0.075426 -0.072652 0.194995 25% -0.015041 -0.014928 0.312962 50% 0.000238 0.000238 0.347414 75% 0.014254 0.014356 0.387611 max 0.053332 0.054780 0.510412
Fetching data for Energy_SPDR_ETF (XLE)...
[*********************100%***********************] 1 of 1 completed
Data for Energy_SPDR_ETF saved to Energy_SPDR_ETF_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000118 0.000220 0.225758 std 0.014287 0.014284 0.049664 min -0.055196 -0.053700 0.130263 25% -0.009359 -0.009315 0.189967 50% 0.000458 0.000458 0.223895 75% 0.009067 0.009108 0.254982 max 0.044278 0.045273 0.358635 Fetching data for VanEck_Oil_ETF (OIH)...
[*********************100%***********************] 1 of 1 completed
Data for VanEck_Oil_ETF saved to VanEck_Oil_ETF_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000322 0.000523 0.310716 std 0.020092 0.020058 0.085889 min -0.076020 -0.073203 0.150593 25% -0.009345 -0.009301 0.251619 50% 0.000276 0.000276 0.314504 75% 0.013242 0.013330 0.362974 max 0.066095 0.068329 0.536218
[*********************100%***********************] 1 of 1 completed
Fetching data for SPDR_S&P_Exploration_ETF (XOP)... Data for SPDR_S&P_Exploration_ETF saved to SPDR_S&P_Exploration_ETF_2023.csv
Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000372 0.000530 0.281353 std 0.017823 0.017817 0.065161 min -0.065403 -0.063311 0.161612 25% -0.012211 -0.012137 0.236566 50% 0.001138 0.001138 0.278833 75% 0.012228 0.012303 0.314357 max 0.050920 0.052238 0.431697 Fetching data for ExxonMobil (XOM)...
[*********************100%***********************] 1 of 1 completed
Data for ExxonMobil saved to ExxonMobil_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean -0.000119 0.000003 0.248197 std 0.015660 0.015677 0.050133 min -0.051027 -0.049747 0.157601 25% -0.011008 -0.010947 0.206351 50% 0.000090 0.000090 0.244240 75% 0.009022 0.009063 0.284913 max 0.057326 0.059000 0.377416 Fetching data for Chevron (CVX)...
[*********************100%***********************] 1 of 1 completed
Data for Chevron saved to Chevron_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean -0.000464 -0.000359 0.226455 std 0.014534 0.014484 0.055295 min -0.069570 -0.067205 0.133583 25% -0.008345 -0.008310 0.188684 50% -0.000120 -0.000120 0.213583 75% 0.007778 0.007809 0.256822 max 0.047492 0.048637 0.367280 Fetching data for BPplc (BP)...
[*********************100%***********************] 1 of 1 completed
Data for BPplc saved to BPplc_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000305 0.000429 0.248770 std 0.015833 0.015799 0.067488 min -0.084175 -0.080730 0.140689 25% -0.008465 -0.008429 0.190797 50% 0.000777 0.000777 0.243896 75% 0.008956 0.008996 0.313371 max 0.080219 0.083525 0.376885 Fetching data for Shellplc (SHEL)...
[*********************100%***********************] 1 of 1 completed
Data for Shellplc saved to Shellplc_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000808 0.000902 0.216359 std 0.013770 0.013763 0.058839 min -0.067011 -0.064815 0.100732 25% -0.006992 -0.006968 0.180034 50% 0.001603 0.001605 0.215089 75% 0.009200 0.009242 0.233886 max 0.050328 0.051616 0.392383 Fetching data for TotalEnergies (TTE)...
[*********************100%***********************] 1 of 1 completed
Data for TotalEnergies saved to TotalEnergies_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000574 0.000677 0.228171 std 0.014331 0.014342 0.053382 min -0.044171 -0.043210 0.114297 25% -0.009592 -0.009546 0.190619 50% 0.001971 0.001973 0.231794 75% 0.010083 0.010134 0.255760 max 0.066970 0.069263 0.391471
[*********************100%***********************] 1 of 1 completed
Fetching data for Gold (GC=F)... Data for Gold saved to Gold_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000459 0.000494 0.127636 std 0.008321 0.008331 0.035496 min -0.028262 -0.027866 0.074316 25% -0.004029 -0.004021 0.099957 50% 0.000108 0.000108 0.121441 75% 0.004800 0.004812 0.152001 max 0.030608 0.031081 0.219037
[*********************100%***********************] 1 of 1 completed
Fetching data for US_13week_Treasury (^IRX)... Data for US_13week_Treasury saved to US_13week_Treasury_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000790 0.000830 0.105136 std 0.008907 0.008920 0.093383 min -0.053736 -0.052318 0.021988 25% -0.001909 -0.001907 0.037481 50% 0.000381 0.000381 0.060702 75% 0.002853 0.002857 0.144180 max 0.039283 0.040065 0.356717
[*********************100%***********************] 1 of 1 completed
Fetching data for US_10year_Treasury (^TNX)... Data for US_10year_Treasury saved to US_10year_Treasury_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean 0.000077 0.000258 0.294563 std 0.019115 0.019065 0.067933 min -0.060386 -0.058599 0.179998 25% -0.012288 -0.012213 0.256861 50% 0.000000 0.000000 0.280897 75% 0.012911 0.012994 0.312546 max 0.042855 0.043787 0.505455
[*********************100%***********************] 1 of 1 completed
Fetching data for Volatility_Index (^VIX)... Data for Volatility_Index saved to Volatility_Index_2023.csv Price Log Return Simple Return Volatility Ticker count 249.000000 249.000000 229.000000 mean -0.002447 -0.000950 0.866112 std 0.054650 0.055161 0.212825 min -0.155894 -0.144350 0.404276 25% -0.039640 -0.038865 0.714051 50% -0.004484 -0.004474 0.881074 75% 0.024762 0.025071 1.011744 max 0.168181 0.183150 1.328913
Data fetch completed.
In [ ]:
import pandas as pd
# Function to preprocess the data
def preprocess_data(filename):
# Read CSV (Skip first two rows if necessary)
df = pd.read_csv(filename, skiprows=2)
# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])
# Set 'Date' as Index
df.set_index('Date', inplace=True)
# Rename columns to appropriate names
df.columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Log Return', 'Simple Return', 'Volatility']
# Clean and convert columns to numeric types
for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
# Convert to string first, then replace commas and convert to numeric
df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', ''), errors='coerce')
# Check for any NaN values after conversion
if df.isnull().values.any():
print(f"Warning: There are NaN values in the DataFrame after conversion in {filename}.")
return df
# List of files to preprocess
files = [
"Crude_Oil_2023.csv",
"Brent_Oil_2023.csv",
"Natural_Gas_2023.csv",
"RBOB_Gasoline_2023.csv",
"Heating_Oil_2023.csv",
"Energy_SPDR_ETF_2023.csv",
"VanEck_Oil_ETF_2023.csv",
"SPDR_S&P_Exploration_ETF_2023.csv",
"ExxonMobil_2023.csv",
"Chevron_2023.csv",
"BPplc_2023.csv",
"Shellplc_2023.csv",
"TotalEnergies_2023.csv",
"Gold_2023.csv",
"US_13week_Treasury_2023.csv",
"US_10year_Treasury_2023.csv",
"Volatility_Index_2023.csv"
]
# Preprocess each file
dataframes = {}
for file in files:
dataframes[file] = preprocess_data(file)
print(f"Processed {file}:\n", dataframes[file].head(), "\n")
print(dataframes[file].info(), "\n")
print(dataframes[file].describe(), "\n")
print(dataframes[file].dtypes)
Warning: There are NaN values in the DataFrame after conversion in Crude_Oil_2023.csv.
Processed Crude_Oil_2023.csv:
Open High Low Close Volume Log Return \
Date
2023-01-03 76.930000 81.500000 76.599998 80.570000 338520 NaN
2023-01-04 72.839996 77.419998 72.730003 77.250000 352434 -0.054631
2023-01-05 73.669998 74.919998 72.459999 73.250000 300731 0.011330
2023-01-06 73.769997 75.470001 73.239998 73.970001 258128 0.001356
2023-01-09 74.629997 76.739998 73.470001 73.470001 329290 0.011590
Simple Return Volatility
Date
2023-01-03 NaN NaN
2023-01-04 -0.053165 NaN
2023-01-05 0.011395 NaN
2023-01-06 0.001357 NaN
2023-01-09 0.011658 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 250.000000
mean 77.597120 78.885520 76.370440 77.726560 313673.744000
std 6.008919 5.989190 6.098372 6.032712 97171.270189
min 66.739998 67.699997 63.639999 66.620003 0.000000
25% 72.832500 74.127502 71.712500 73.032503 275205.500000
50% 77.084999 78.074997 75.690002 77.155003 321811.500000
75% 81.017498 82.344997 80.099998 81.275002 367279.500000
max 93.680000 95.029999 91.389999 93.779999 559169.000000
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean -0.000286 -0.000060 0.332574
std 0.021288 0.021232 0.071661
min -0.057785 -0.056147 0.155874
25% -0.014003 -0.013906 0.288260
50% 0.002026 0.002028 0.345937
75% 0.013395 0.013485 0.388037
max 0.060881 0.062773 0.459743
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in Brent_Oil_2023.csv.
Processed Brent_Oil_2023.csv:
Open High Low Close Volume Log Return \
Date
2023-01-03 82.099998 87.019997 81.769997 86.040001 27559 NaN
2023-01-04 77.839996 82.669998 77.720001 82.230003 24772 -0.053283
2023-01-05 78.690002 79.959999 77.610001 78.089996 28051 0.010861
2023-01-06 78.570000 80.570000 78.050003 78.809998 23767 -0.001526
2023-01-09 79.650002 81.370003 78.339996 78.480003 29985 0.013652
Simple Return Volatility
Date
2023-01-03 NaN NaN
2023-01-04 -0.051888 NaN
2023-01-05 0.010920 NaN
2023-01-06 -0.001525 NaN
2023-01-09 0.013746 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 251 non-null float64
1 High 251 non-null float64
2 Low 251 non-null float64
3 Close 251 non-null float64
4 Volume 251 non-null int64
5 Log Return 250 non-null float64
6 Simple Return 250 non-null float64
7 Volatility 230 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 251.000000 251.000000 251.000000 251.000000 251.000000
mean 82.190438 83.381753 80.988407 82.243865 26777.577689
std 5.577078 5.498371 5.664448 5.564913 9075.619608
min 71.839996 73.730003 68.199997 71.889999 0.000000
25% 77.445000 78.715000 76.450001 77.579998 21021.000000
50% 82.470001 83.470001 81.070000 82.580002 26913.000000
75% 85.814999 86.790001 84.555000 85.844997 32191.000000
max 96.550003 97.629997 94.959999 96.620003 59320.000000
Log Return Simple Return Volatility
count 250.000000 250.000000 230.000000
mean -0.000254 -0.000056 0.309385
std 0.019963 0.019898 0.076432
min -0.057844 -0.056203 0.131897
25% -0.011749 -0.011680 0.260908
50% 0.001553 0.001554 0.332548
75% 0.013642 0.013736 0.363194
max 0.062680 0.064686 0.447963
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in Natural_Gas_2023.csv.
Processed Natural_Gas_2023.csv:
Open High Low Close Volume Log Return Simple Return \
Date
2023-01-03 3.988 4.394 3.894 4.393 116837 NaN NaN
2023-01-04 4.172 4.219 3.900 4.008 99759 0.045106 0.046138
2023-01-05 3.720 4.175 3.651 4.155 116682 -0.114672 -0.108341
2023-01-06 3.710 3.839 3.520 3.764 105050 -0.002692 -0.002688
2023-01-09 3.910 4.128 3.781 3.810 130276 0.052506 0.053908
Volatility
Date
2023-01-03 NaN
2023-01-04 NaN
2023-01-05 NaN
2023-01-06 NaN
2023-01-09 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 251 non-null float64
1 High 251 non-null float64
2 Low 251 non-null float64
3 Close 251 non-null float64
4 Volume 251 non-null int64
5 Log Return 250 non-null float64
6 Simple Return 250 non-null float64
7 Volatility 230 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 251.000000 251.000000 251.000000 251.000000 251.000000
mean 2.665689 2.753845 2.591215 2.679028 137590.215139
std 0.402081 0.425602 0.391230 0.417368 53342.669274
min 1.991000 2.083000 1.944000 2.015000 0.000000
25% 2.382500 2.465500 2.316000 2.394500 109707.000000
50% 2.603000 2.674000 2.540000 2.620000 135522.000000
75% 2.827000 2.896500 2.760500 2.815000 164728.500000
max 4.172000 4.394000 3.900000 4.393000 330300.000000
Log Return Simple Return Volatility
count 250.000000 250.000000 230.000000
mean -0.001846 -0.000907 0.666541
std 0.043472 0.043258 0.171532
min -0.156924 -0.145231 0.425533
25% -0.031833 -0.031332 0.535535
50% -0.000899 -0.000898 0.604309
75% 0.028986 0.029411 0.772925
max 0.108172 0.114239 1.057179
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in RBOB_Gasoline_2023.csv.
Processed RBOB_Gasoline_2023.csv:
Open High Low Close Volume Log Return Simple Return \
Date
2023-01-03 2.3612 2.5057 2.3484 2.4909 65711 NaN NaN
2023-01-04 2.2592 2.3754 2.2433 2.3501 40222 -0.044159 -0.043198
2023-01-05 2.2671 2.3131 2.2554 2.2682 47548 0.003491 0.003497
2023-01-06 2.2446 2.3187 2.2356 2.2700 45192 -0.009974 -0.009925
2023-01-09 2.2929 2.3356 2.2510 2.2510 49984 0.021290 0.021518
Volatility
Date
2023-01-03 NaN
2023-01-04 NaN
2023-01-05 NaN
2023-01-06 NaN
2023-01-09 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 251 non-null float64
1 High 251 non-null float64
2 Low 251 non-null float64
3 Close 251 non-null float64
4 Volume 251 non-null int64
5 Log Return 250 non-null float64
6 Simple Return 250 non-null float64
7 Volatility 230 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 251.000000 251.000000 251.000000 251.000000 251.000000
mean 2.509494 2.548939 2.468406 2.511362 45500.243028
std 0.232477 0.229328 0.227835 0.229636 16806.739748
min 1.979700 2.041000 1.967200 1.987100 0.000000
25% 2.321550 2.359550 2.265700 2.314500 32296.000000
50% 2.545100 2.587100 2.502100 2.544000 46981.000000
75% 2.676700 2.711250 2.631850 2.678400 56205.000000
max 2.964900 2.993600 2.924000 2.949000 121248.000000
Log Return Simple Return Volatility
count 250.000000 250.000000 230.000000
mean -0.000464 -0.000211 0.356803
std 0.022547 0.022495 0.054429
min -0.071156 -0.068684 0.231217
25% -0.014693 -0.014585 0.323705
50% 0.002527 0.002530 0.357403
75% 0.015393 0.015512 0.388705
max 0.094215 0.098796 0.488964
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in Heating_Oil_2023.csv.
Processed Heating_Oil_2023.csv:
Open High Low Close Volume Log Return Simple Return \
Date
2023-01-03 3.0865 3.3322 3.0755 3.2965 58567 NaN NaN
2023-01-04 2.9719 3.0912 2.9215 3.0682 51609 -0.037836 -0.037129
2023-01-05 2.9723 3.0669 2.9200 3.0016 55068 0.000135 0.000135
2023-01-06 3.0045 3.0536 2.9520 2.9789 41596 0.010775 0.010833
2023-01-09 3.0360 3.1069 2.9949 3.0054 49943 0.010430 0.010484
Volatility
Date
2023-01-03 NaN
2023-01-04 NaN
2023-01-05 NaN
2023-01-06 NaN
2023-01-09 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 251 non-null float64
1 High 251 non-null float64
2 Low 251 non-null float64
3 Close 251 non-null float64
4 Volume 251 non-null int64
5 Log Return 250 non-null float64
6 Simple Return 250 non-null float64
7 Volatility 230 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 251.000000 251.000000 251.000000 251.000000 251.000000
mean 2.813668 2.863859 2.765468 2.817678 41625.661355
std 0.315287 0.323386 0.307672 0.316162 15232.702850
min 2.232300 2.269500 2.150000 2.210200 0.000000
25% 2.578600 2.621200 2.547800 2.580500 28271.500000
50% 2.777600 2.836900 2.734000 2.784200 43513.000000
75% 3.057050 3.108400 3.006750 3.069050 53725.000000
max 3.550900 3.580000 3.461500 3.540600 75580.000000
Log Return Simple Return Volatility
count 250.000000 250.000000 230.000000
mean -0.000759 -0.000509 0.353132
std 0.022432 0.022357 0.065057
min -0.075426 -0.072652 0.194995
25% -0.015041 -0.014928 0.312962
50% 0.000238 0.000238 0.347414
75% 0.014254 0.014356 0.387611
max 0.053332 0.054780 0.510412
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in Energy_SPDR_ETF_2023.csv.
Processed Energy_SPDR_ETF_2023.csv:
Open High Low Close Volume Log Return \
Date
2023-01-03 78.797256 81.560761 77.816956 81.149968 26541400 NaN
2023-01-04 78.787910 79.245381 77.331471 77.630228 22852600 -0.000119
2023-01-05 80.225685 80.664486 78.433143 78.657209 19361900 0.018084
2023-01-06 81.747475 82.746444 81.056600 81.261997 22211200 0.018791
2023-01-09 81.458054 83.110552 81.196642 82.979846 23001600 -0.003547
Simple Return Volatility
Date
2023-01-03 NaN NaN
2023-01-04 -0.000119 NaN
2023-01-05 0.018249 NaN
2023-01-06 0.018969 NaN
2023-01-09 -0.003540 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 2.500000e+02
mean 80.606490 81.366511 79.860143 80.628933 1.987130e+07
std 3.967614 3.945646 4.014046 4.049407 5.685873e+06
min 71.860481 72.950053 70.357359 70.637447 9.576300e+06
25% 77.535322 78.358228 77.143755 77.808466 1.636542e+07
50% 81.172474 81.807018 80.463929 81.071801 1.897520e+07
75% 83.351974 84.160872 82.582509 83.467739 2.283078e+07
max 88.847862 89.161915 88.467193 88.966643 5.767350e+07
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean 0.000118 0.000220 0.225758
std 0.014287 0.014284 0.049664
min -0.055196 -0.053700 0.130263
25% -0.009359 -0.009315 0.189967
50% 0.000458 0.000458 0.223895
75% 0.009067 0.009108 0.254982
max 0.044278 0.045273 0.358635
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in VanEck_Oil_ETF_2023.csv.
Processed VanEck_Oil_ETF_2023.csv:
Open High Low Close Volume \
Date
2023-01-03 279.780548 293.673451 274.969228 291.596285 762600
2023-01-04 280.814331 283.104046 274.090099 275.056225 828600
2023-01-05 286.108704 287.896014 279.683939 280.978545 670200
2023-01-06 295.228912 299.180396 289.615720 290.833049 1247500
2023-01-09 301.653687 307.788566 300.368722 302.523194 1131200
Log Return Simple Return Volatility
Date
2023-01-03 NaN NaN NaN
2023-01-04 0.003688 0.003695 NaN
2023-01-05 0.018678 0.018854 NaN
2023-01-06 0.031379 0.031877 NaN
2023-01-09 0.021529 0.021762 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 2.500000e+02
mean 298.916737 302.779731 295.069055 299.015556 6.074528e+05
std 28.811957 28.778691 28.939455 28.961266 2.700343e+05
min 238.420685 243.859981 237.705746 238.894095 2.080000e+05
25% 273.099823 276.256640 268.674932 273.602193 4.089750e+05
50% 305.380280 309.602004 302.159718 306.407031 5.297000e+05
75% 320.797485 325.719885 318.034352 321.509976 7.366250e+05
max 350.027618 351.747322 346.472289 350.684577 1.918700e+06
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean 0.000322 0.000523 0.310716
std 0.020092 0.020058 0.085889
min -0.076020 -0.073203 0.150593
25% -0.009345 -0.009301 0.251619
50% 0.000276 0.000276 0.314504
75% 0.013242 0.013330 0.362974
max 0.066095 0.068329 0.536218
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in SPDR_S&P_Exploration_ETF_2023.csv.
Processed SPDR_S&P_Exploration_ETF_2023.csv:
Open High Low Close Volume \
Date
2023-01-03 121.890556 128.636921 120.389252 127.335149 6321700
2023-01-04 122.318138 123.629407 119.182511 119.695615 5151500
2023-01-05 122.498688 123.553401 120.474782 121.757540 4576400
2023-01-06 125.007187 126.375465 123.648420 124.294543 2849100
2023-01-09 126.280418 129.226017 125.938349 128.057286 3898700
Log Return Simple Return Volatility
Date
2023-01-03 NaN NaN NaN
2023-01-04 0.003502 0.003508 NaN
2023-01-05 0.001475 0.001476 NaN
2023-01-06 0.020271 0.020478 NaN
2023-01-09 0.010134 0.010185 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 2.500000e+02
mean 130.352460 131.907668 128.786941 130.415726 4.582904e+06
std 9.609913 9.531173 9.744432 9.694111 1.620531e+06
min 111.200905 113.594735 108.473866 108.939458 2.005800e+06
25% 122.275383 123.673703 120.901568 122.411931 3.465050e+06
50% 130.912636 132.537463 128.727188 130.847782 4.352900e+06
75% 138.167980 139.437562 136.592073 137.770904 5.272400e+06
max 148.309601 149.444216 147.069642 148.656721 1.538520e+07
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean 0.000372 0.000530 0.281353
std 0.017823 0.017817 0.065161
min -0.065403 -0.063311 0.161612
25% -0.012211 -0.012137 0.236566
50% 0.001138 0.001138 0.278833
75% 0.012228 0.012303 0.314357
max 0.050920 0.052238 0.431697
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in ExxonMobil_2023.csv.
Processed ExxonMobil_2023.csv:
Open High Low Close Volume \
Date
2023-01-03 98.713257 101.966313 97.767919 101.743883 15146200
2023-01-04 99.000557 99.315672 97.091355 97.128428 18058400
2023-01-05 101.215607 101.901435 98.527891 98.555694 15946600
2023-01-06 102.438980 103.792104 101.641933 102.021924 16348100
2023-01-09 100.529785 103.467733 99.918095 103.467733 17964600
Log Return Simple Return Volatility
Date
2023-01-03 NaN NaN NaN
2023-01-04 0.002906 0.002910 NaN
2023-01-05 0.022127 0.022374 NaN
2023-01-06 0.012014 0.012087 NaN
2023-01-09 -0.018813 -0.018637 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 2.500000e+02
mean 102.334976 103.402265 101.293446 102.350723 1.747948e+07
std 4.504475 4.514376 4.452520 4.511013 6.544464e+06
min 93.243523 94.578936 91.543774 92.561761 7.397000e+06
25% 98.984196 99.949766 97.890439 98.977115 1.347562e+07
50% 101.492382 102.777576 100.543262 101.813100 1.583425e+07
75% 105.413452 106.388596 104.385516 105.347774 1.944365e+07
max 114.160667 114.635542 112.251660 113.951724 5.793900e+07
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean -0.000119 0.000003 0.248197
std 0.015660 0.015677 0.050133
min -0.051027 -0.049747 0.157601
25% -0.011008 -0.010947 0.206351
50% 0.000090 0.000090 0.244240
75% 0.009022 0.009063 0.284913
max 0.057326 0.059000 0.377416
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in Chevron_2023.csv.
Processed Chevron_2023.csv:
Open High Low Close Volume \
Date
2023-01-03 158.737152 163.307945 156.474552 162.432097 7565400
2023-01-04 157.049301 159.010815 155.872395 155.963621 7684500
2023-01-05 159.877563 160.744279 156.228224 156.465426 6088200
2023-01-06 161.081818 163.663727 160.297209 161.501498 7191200
2023-01-09 159.822830 162.696694 159.092973 162.632824 8385600
Log Return Simple Return Volatility
Date
2023-01-03 NaN NaN NaN
2023-01-04 -0.010690 -0.010633 NaN
2023-01-05 0.017849 0.018009 NaN
2023-01-06 0.007504 0.007532 NaN
2023-01-09 -0.007847 -0.007816 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 2.500000e+02
mean 148.810178 150.219206 147.493921 148.928152 8.646823e+06
std 7.686490 7.739064 7.639849 7.794345 3.794895e+06
min 133.188736 134.596238 132.991662 134.230300 3.272600e+06
25% 143.320148 144.802549 141.737522 143.218724 6.400250e+06
50% 148.376137 150.168435 147.480297 148.933785 7.474050e+06
75% 155.577522 156.863780 154.584672 155.632771 1.000020e+07
max 171.327362 171.345613 167.395201 168.772836 3.316180e+07
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean -0.000464 -0.000359 0.226455
std 0.014534 0.014484 0.055295
min -0.069570 -0.067205 0.133583
25% -0.008345 -0.008310 0.188684
50% -0.000120 -0.000120 0.213583
75% 0.007778 0.007809 0.256822
max 0.047492 0.048637 0.367280
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in BPplc_2023.csv.
Processed BPplc_2023.csv:
Open High Low Close Volume Log Return \
Date
2023-01-03 30.648390 31.425315 30.442996 31.255643 7967400 NaN
2023-01-04 30.166155 30.416202 29.942900 30.184016 7492400 -0.015860
2023-01-05 30.273323 30.353695 29.969697 29.987557 6440200 0.003546
2023-01-06 30.773409 30.898431 30.380482 30.532294 6855400 0.016384
2023-01-09 31.050249 31.478898 30.898434 31.380665 7831300 0.008956
Simple Return Volatility
Date
2023-01-03 NaN NaN
2023-01-04 -0.015734 NaN
2023-01-05 0.003553 NaN
2023-01-06 0.016519 NaN
2023-01-09 0.008996 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 2.500000e+02
mean 33.795749 34.063363 33.527429 33.802296 8.530306e+06
std 1.661742 1.671672 1.671519 1.677034 3.690183e+06
min 30.166155 30.353695 29.942900 29.987557 3.688000e+06
25% 32.520833 32.796460 32.275912 32.563105 6.359875e+06
50% 33.406439 33.699594 33.205290 33.447248 7.753350e+06
75% 35.509333 35.866389 35.333720 35.610078 9.179825e+06
max 37.641953 37.669621 37.309897 37.531264 3.522320e+07
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean 0.000305 0.000429 0.248770
std 0.015833 0.015799 0.067488
min -0.084175 -0.080730 0.140689
25% -0.008465 -0.008429 0.190797
50% 0.000777 0.000777 0.243896
75% 0.008956 0.008996 0.313371
max 0.080219 0.083525 0.376885
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in Shellplc_2023.csv.
Processed Shellplc_2023.csv:
Open High Low Close Volume Log Return \
Date
2023-01-03 51.137989 52.795999 51.101448 52.499112 7766476 NaN
2023-01-04 50.635567 51.128860 50.361516 50.982699 5712988 -0.009873
2023-01-05 50.681244 50.763184 50.215358 50.233629 4266050 0.000902
2023-01-06 52.298141 52.330114 51.192802 51.265880 5845518 0.031405
2023-01-09 53.047215 53.532287 52.709219 53.129430 4939246 0.014222
Simple Return Volatility
Date
2023-01-03 NaN NaN
2023-01-04 -0.009825 NaN
2023-01-05 0.000902 NaN
2023-01-06 0.031903 NaN
2023-01-09 0.014323 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 2.500000e+02
mean 57.496620 57.897276 57.085029 57.464945 4.936137e+06
std 3.440649 3.438978 3.526171 3.554715 1.720621e+06
min 49.741489 50.437595 48.376945 46.826244 2.036330e+06
25% 55.240446 55.711387 54.597055 55.153586 3.693633e+06
50% 56.828331 57.209628 56.467426 56.762466 4.597592e+06
75% 60.703148 61.156732 60.364325 60.714167 5.767324e+06
max 64.488960 64.667699 64.056213 64.366665 1.277934e+07
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean 0.000808 0.000902 0.216359
std 0.013770 0.013763 0.058839
min -0.067011 -0.064815 0.100732
25% -0.006992 -0.006968 0.180034
50% 0.001603 0.001605 0.215089
75% 0.009200 0.009242 0.233886
max 0.050328 0.051616 0.392383
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in TotalEnergies_2023.csv.
Processed TotalEnergies_2023.csv:
Open High Low Close Volume Log Return \
Date
2023-01-03 55.388927 56.894455 55.370897 56.678091 1876900 NaN
2023-01-04 55.064388 55.821656 54.766887 55.488097 1507900 -0.005877
2023-01-05 55.091434 55.262721 54.685753 54.730828 1097600 0.000491
2023-01-06 55.740524 56.281430 55.388931 55.677415 2221500 0.011713
2023-01-09 56.101124 56.867412 55.893776 56.696122 2200000 0.006448
Simple Return Volatility
Date
2023-01-03 NaN NaN
2023-01-04 -0.005859 NaN
2023-01-05 0.000491 NaN
2023-01-06 0.011782 NaN
2023-01-09 0.006469 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 2.500000e+02
mean 57.728019 58.167583 57.348465 57.793098 1.435535e+06
std 3.630004 3.641039 3.679748 3.686205 5.987413e+05
min 50.874840 51.294990 49.610235 50.079022 5.047000e+05
25% 54.894386 55.448546 54.506603 54.970169 1.020625e+06
50% 57.324015 57.588621 56.885443 57.163453 1.293200e+06
75% 61.273447 61.753788 61.022727 61.507760 1.690550e+06
max 64.820984 65.261491 64.633532 64.952200 4.315900e+06
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean 0.000574 0.000677 0.228171
std 0.014331 0.014342 0.053382
min -0.044171 -0.043210 0.114297
25% -0.009592 -0.009546 0.190619
50% 0.001971 0.001973 0.231794
75% 0.010083 0.010134 0.255760
max 0.066970 0.069263 0.391471
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in Gold_2023.csv.
Processed Gold_2023.csv:
Open High Low Close Volume \
Date
2023-01-03 1839.699951 1839.699951 1836.199951 1836.199951 29
2023-01-04 1852.800049 1859.099976 1845.599976 1845.599976 25
2023-01-05 1834.800049 1855.199951 1834.800049 1855.199951 24
2023-01-06 1864.199951 1868.199951 1835.300049 1838.400024 26
2023-01-09 1872.699951 1880.000000 1867.000000 1867.000000 62
Log Return Simple Return Volatility
Date
2023-01-03 NaN NaN NaN
2023-01-04 0.007096 0.007121 NaN
2023-01-05 -0.009763 -0.009715 NaN
2023-01-06 0.015896 0.016023 NaN
2023-01-09 0.004549 0.004560 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume \
count 250.000000 250.000000 250.000000 250.000000 250.000000
mean 1942.769197 1950.744001 1935.432801 1942.983197 3985.948000
std 60.083909 61.702305 59.592685 60.484172 22478.477684
min 1808.800049 1808.800049 1808.099976 1808.099976 0.000000
25% 1911.849976 1917.375000 1903.224976 1910.325043 59.500000
50% 1944.450012 1957.549988 1937.900024 1945.799988 202.000000
75% 1983.175018 1990.250031 1977.624969 1984.499969 541.500000
max 2081.899902 2130.199951 2066.500000 2081.600098 194253.000000
Log Return Simple Return Volatility
count 249.000000 249.000000 229.000000
mean 0.000459 0.000494 0.127636
std 0.008321 0.008331 0.035496
min -0.028262 -0.027866 0.074316
25% -0.004029 -0.004021 0.099957
50% 0.000108 0.000108 0.121441
75% 0.004800 0.004812 0.152001
max 0.030608 0.031081 0.219037
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in US_13week_Treasury_2023.csv.
Processed US_13week_Treasury_2023.csv:
Open High Low Close Volume Log Return Simple Return \
Date
2023-01-03 4.255 4.260 4.225 4.260 0 NaN NaN
2023-01-04 4.400 4.400 4.378 4.390 0 0.033510 0.034078
2023-01-05 4.498 4.500 4.400 4.408 0 0.022028 0.022273
2023-01-06 4.493 4.520 4.455 4.510 0 -0.001112 -0.001112
2023-01-09 4.483 4.503 4.458 4.490 0 -0.002228 -0.002226
Volatility
Date
2023-01-03 NaN
2023-01-04 NaN
2023-01-05 NaN
2023-01-06 NaN
2023-01-09 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume Log Return \
count 250.00000 250.000000 250.000000 250.00000 250.0 249.000000
mean 5.04516 5.061180 5.010628 5.04550 0.0 0.000790
std 0.29235 0.284998 0.304987 0.29456 0.0 0.008907
min 4.25500 4.260000 4.225000 4.25800 0.0 -0.053736
25% 4.81175 4.833750 4.715500 4.75425 0.0 -0.001909
50% 5.20300 5.216500 5.142500 5.20500 0.0 0.000381
75% 5.26800 5.277250 5.252250 5.27000 0.0 0.002853
max 5.34800 5.348000 5.348000 5.34800 0.0 0.039283
Simple Return Volatility
count 249.000000 229.000000
mean 0.000830 0.105136
std 0.008920 0.093383
min -0.052318 0.021988
25% -0.001907 0.037481
50% 0.000381 0.060702
75% 0.002857 0.144180
max 0.040065 0.356717
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in US_10year_Treasury_2023.csv.
Processed US_10year_Treasury_2023.csv:
Open High Low Close Volume Log Return Simple Return \
Date
2023-01-03 3.793 3.810 3.724 3.758 0 NaN NaN
2023-01-04 3.709 3.728 3.664 3.677 0 -0.022395 -0.022146
2023-01-05 3.720 3.784 3.698 3.728 0 0.002961 0.002966
2023-01-06 3.569 3.754 3.558 3.746 0 -0.041438 -0.040591
2023-01-09 3.517 3.591 3.508 3.589 0 -0.014677 -0.014570
Volatility
Date
2023-01-03 NaN
2023-01-04 NaN
2023-01-05 NaN
2023-01-06 NaN
2023-01-09 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume Log Return \
count 250.000000 250.000000 250.000000 250.000000 250.0 249.000000
mean 3.965036 4.004144 3.922412 3.962016 0.0 0.000077
std 0.429600 0.430860 0.431096 0.430831 0.0 0.019115
min 3.287000 3.305000 3.253000 3.268000 0.0 -0.060386
25% 3.593750 3.639000 3.553000 3.586750 0.0 -0.012288
50% 3.871500 3.906500 3.848000 3.873500 0.0 0.000000
75% 4.263000 4.298750 4.233500 4.277000 0.0 0.012911
max 4.988000 4.997000 4.894000 4.997000 0.0 0.042855
Simple Return Volatility
count 249.000000 229.000000
mean 0.000258 0.294563
std 0.019065 0.067933
min -0.058599 0.179998
25% -0.012213 0.256861
50% 0.000000 0.280897
75% 0.012994 0.312546
max 0.043787 0.505455
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
Warning: There are NaN values in the DataFrame after conversion in Volatility_Index_2023.csv.
Processed Volatility_Index_2023.csv:
Open High Low Close Volume Log Return \
Date
2023-01-03 22.900000 23.76 22.730000 23.090000 0 NaN
2023-01-04 22.010000 23.27 21.940001 22.930000 0 -0.039640
2023-01-05 22.459999 22.92 21.969999 22.200001 0 0.020239
2023-01-06 21.129999 22.90 21.000000 22.690001 0 -0.061042
2023-01-09 21.969999 21.98 21.270000 21.750000 0 0.038984
Simple Return Volatility
Date
2023-01-03 NaN NaN
2023-01-04 -0.038865 NaN
2023-01-05 0.020445 NaN
2023-01-06 -0.059216 NaN
2023-01-09 0.039754 NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2023-01-03 to 2023-12-29
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 250 non-null float64
1 High 250 non-null float64
2 Low 250 non-null float64
3 Close 250 non-null float64
4 Volume 250 non-null int64
5 Log Return 249 non-null float64
6 Simple Return 249 non-null float64
7 Volatility 229 non-null float64
dtypes: float64(7), int64(1)
memory usage: 17.6 KB
None
Open High Low Close Volume Log Return \
count 250.000000 250.000000 250.000000 250.000000 250.0 249.000000
mean 16.870040 17.830640 16.356960 17.119480 0.0 -0.002447
std 3.139177 3.576509 2.885418 3.172261 0.0 0.054650
min 12.070000 12.460000 11.810000 11.960000 0.0 -0.155894
25% 13.932500 14.525000 13.732500 14.182500 0.0 -0.039640
50% 16.935000 17.785001 16.345000 16.959999 0.0 -0.004484
75% 19.080000 20.047500 18.549999 19.377500 0.0 0.024762
max 26.520000 30.809999 24.000000 27.770000 0.0 0.168181
Simple Return Volatility
count 249.000000 229.000000
mean -0.000950 0.866112
std 0.055161 0.212825
min -0.144350 0.404276
25% -0.038865 0.714051
50% -0.004474 0.881074
75% 0.025071 1.011744
max 0.183150 1.328913
Open float64
High float64
Low float64
Close float64
Volume int64
Log Return float64
Simple Return float64
Volatility float64
dtype: object
In [ ]:
import matplotlib.pyplot as plt
for file in files:
data = dataframes[file]
plt.figure(figsize=(10, 6))
plt.plot(data['Close']) # Replace 'Close' with your column name
plt.title('Closing Prices Over Time')
plt.xlabel('DatetimeIndex')
plt.ylabel('Close Price')
plt.show()
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Load the dataset
for file in files:
data = dataframes[file]
data['Close'] = data['Close'].replace([np.inf, -np.inf], np.nan)
data['Close'].ffill() # Forward fill
data['Close'].bfill() # Backward fill
result = adfuller(data['Close'])
print('ADF Statistic:', result[0])
print('p-value:', result[1])
if result[1]>0.05 :
data['Close_diff'] = data['Close'].diff()
data['Close_diff'].ffill() # Fill NaNs after differencing
# Check stationarity after differencing
result_diff = adfuller(data['Close_diff'].dropna())
print('ADF Statistic after Differencing:', result_diff[0])
print('p-value after Differencing:', result_diff[1])
# Determine ARIMA parameters (p, d, q)
plot_acf(data['Close_diff'].dropna(), lags=40)
plot_pacf(data['Close_diff'].dropna(), lags=40)
plt.show()
# Fit the ARIMA model (replace p, d, q with your values based on ACF/PACF)
p, d, q = 1, 1, 2 # Example values, adjust based on your analysis
model = ARIMA(data['Close'], order=(p, d, q))
model_fit = model.fit()
# Summary of the model
print(model_fit.summary())
# Forecasting
forecast = model_fit.forecast(steps=30) # Forecast for the next 30 periods
plt.figure(figsize=(10, 6))
plt.plot(data['Close'], label='Historical Data')
plt.plot(forecast, label='Forecast', color='orange')
plt.title('ARIMA Forecast of Closing Prices')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.show()
ADF Statistic: -2.2037246442165834 p-value: 0.20489640433604267 ADF Statistic after Differencing: -7.511608371394751 p-value after Differencing: 4.001877169606792e-11
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -479.144
Date: Fri, 07 Mar 2025 AIC 966.288
Time: 18:23:35 BIC 980.358
Sample: 0 HQIC 971.951
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.4263 0.627 0.680 0.496 -0.802 1.655
ma.L1 -0.3443 0.624 -0.551 0.581 -1.568 0.879
ma.L2 -0.1209 0.069 -1.763 0.078 -0.255 0.014
sigma2 2.7471 0.236 11.631 0.000 2.284 3.210
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 2.03
Prob(Q): 0.92 Prob(JB): 0.36
Heteroskedasticity (H): 1.00 Skew: -0.18
Prob(H) (two-sided): 0.99 Kurtosis: 3.25
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -2.356414871643807 p-value: 0.15441759946820255 ADF Statistic after Differencing: -7.519275116258346 p-value after Differencing: 3.82900614703005e-11
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq)
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 251
Model: ARIMA(1, 1, 2) Log Likelihood -468.292
Date: Fri, 07 Mar 2025 AIC 944.584
Time: 18:23:36 BIC 958.670
Sample: 0 HQIC 950.254
- 251
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.8578 0.134 6.407 0.000 0.595 1.120
ma.L1 -0.7427 0.139 -5.343 0.000 -1.015 -0.470
ma.L2 -0.1602 0.063 -2.547 0.011 -0.284 -0.037
sigma2 2.4796 0.223 11.096 0.000 2.042 2.918
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 5.94
Prob(Q): 0.96 Prob(JB): 0.05
Heteroskedasticity (H): 1.03 Skew: -0.38
Prob(H) (two-sided): 0.89 Kurtosis: 3.01
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
ADF Statistic: -4.230410970627573 p-value: 0.0005850054404811237 ADF Statistic: -1.5930813263111132 p-value: 0.48706805903440736 ADF Statistic after Differencing: -15.20607092960768 p-value after Differencing: 5.7403941610307795e-28
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 251
Model: ARIMA(1, 1, 2) Log Likelihood 357.730
Date: Fri, 07 Mar 2025 AIC -707.461
Time: 18:23:37 BIC -693.375
Sample: 0 HQIC -701.792
- 251
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.0956 30.903 0.003 0.998 -60.474 60.665
ma.L1 -0.0506 30.898 -0.002 0.999 -60.610 60.509
ma.L2 -0.0065 1.392 -0.005 0.996 -2.734 2.721
sigma2 0.0033 0.000 14.821 0.000 0.003 0.004
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 50.85
Prob(Q): 0.98 Prob(JB): 0.00
Heteroskedasticity (H): 0.54 Skew: -0.32
Prob(H) (two-sided): 0.01 Kurtosis: 5.12
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -1.8642061278731388 p-value: 0.3490824021195 ADF Statistic after Differencing: -15.388315496075856 p-value after Differencing: 3.3359286162502505e-28
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
return get_prediction_index(
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 251
Model: ARIMA(1, 1, 2) Log Likelihood 332.434
Date: Fri, 07 Mar 2025 AIC -656.869
Time: 18:23:38 BIC -642.783
Sample: 0 HQIC -651.200
- 251
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.0243 74.925 0.000 1.000 -146.827 146.875
ma.L1 0.0243 74.926 0.000 1.000 -146.827 146.876
ma.L2 -0.0003 3.647 -9.36e-05 1.000 -7.149 7.148
sigma2 0.0041 0.000 11.656 0.000 0.003 0.005
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 3.50
Prob(Q): 0.96 Prob(JB): 0.17
Heteroskedasticity (H): 0.86 Skew: -0.27
Prob(H) (two-sided): 0.51 Kurtosis: 3.21
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -2.407085800533096 p-value: 0.13974647341271612 ADF Statistic after Differencing: -15.885336836753535 p-value after Differencing: 8.663252375545324e-29
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -403.045
Date: Fri, 07 Mar 2025 AIC 814.090
Time: 18:23:39 BIC 828.159
Sample: 0 HQIC 819.753
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.3419 0.427 -0.801 0.423 -1.179 0.495
ma.L1 0.3529 0.426 0.829 0.407 -0.481 1.187
ma.L2 -0.0992 0.065 -1.518 0.129 -0.227 0.029
sigma2 1.4908 0.131 11.392 0.000 1.234 1.747
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 1.08
Prob(Q): 0.99 Prob(JB): 0.58
Heteroskedasticity (H): 0.50 Skew: -0.15
Prob(H) (two-sided): 0.00 Kurtosis: 3.11
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -1.7385971808852456 p-value: 0.41134372256779017 ADF Statistic after Differencing: -11.947236185782739 p-value after Differencing: 4.401029357726941e-22
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -812.038
Date: Fri, 07 Mar 2025 AIC 1632.077
Time: 18:23:40 BIC 1646.147
Sample: 0 HQIC 1637.740
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.1640 0.490 -0.335 0.738 -1.124 0.796
ma.L1 0.2253 0.490 0.459 0.646 -0.736 1.186
ma.L2 -0.0951 0.069 -1.378 0.168 -0.230 0.040
sigma2 39.8209 3.166 12.579 0.000 33.616 46.026
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 3.66
Prob(Q): 0.96 Prob(JB): 0.16
Heteroskedasticity (H): 0.46 Skew: -0.07
Prob(H) (two-sided): 0.00 Kurtosis: 3.58
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -2.011030727205811 p-value: 0.28175755214705084 ADF Statistic after Differencing: -16.2162791677464 p-value after Differencing: 3.9394890209660076e-29
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -578.759
Date: Fri, 07 Mar 2025 AIC 1165.518
Time: 18:23:41 BIC 1179.588
Sample: 0 HQIC 1171.181
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.2149 0.780 -0.275 0.783 -1.744 1.314
ma.L1 0.2014 0.782 0.258 0.797 -1.331 1.734
ma.L2 -0.0766 0.060 -1.273 0.203 -0.195 0.041
sigma2 6.1151 0.558 10.953 0.000 5.021 7.209
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 1.99
Prob(Q): 0.99 Prob(JB): 0.37
Heteroskedasticity (H): 0.61 Skew: -0.22
Prob(H) (two-sided): 0.03 Kurtosis: 2.96
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -2.851425951173653 p-value: 0.05129509801105939 ADF Statistic after Differencing: -12.478638254300106 p-value after Differencing: 3.1272223854677037e-23
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -482.359
Date: Fri, 07 Mar 2025 AIC 972.718
Time: 18:23:41 BIC 986.788
Sample: 0 HQIC 978.382
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.3358 0.371 -0.904 0.366 -1.064 0.392
ma.L1 0.3356 0.369 0.910 0.363 -0.387 1.058
ma.L2 -0.1303 0.061 -2.141 0.032 -0.250 -0.011
sigma2 2.8188 0.250 11.291 0.000 2.330 3.308
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 2.65
Prob(Q): 0.99 Prob(JB): 0.27
Heteroskedasticity (H): 0.57 Skew: -0.25
Prob(H) (two-sided): 0.01 Kurtosis: 3.05
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -2.485018534492955 p-value: 0.1191621590776008 ADF Statistic after Differencing: -15.90438866986888 p-value after Differencing: 8.25912546722968e-29
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -545.645
Date: Fri, 07 Mar 2025 AIC 1099.289
Time: 18:23:42 BIC 1113.359
Sample: 0 HQIC 1104.952
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.2324 0.508 -0.457 0.647 -1.229 0.764
ma.L1 0.2410 0.503 0.479 0.632 -0.746 1.228
ma.L2 -0.1015 0.061 -1.676 0.094 -0.220 0.017
sigma2 4.6866 0.400 11.727 0.000 3.903 5.470
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 1.81
Prob(Q): 0.98 Prob(JB): 0.40
Heteroskedasticity (H): 0.51 Skew: -0.18
Prob(H) (two-sided): 0.00 Kurtosis: 3.21
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -2.961931402381136 p-value: 0.03860572787149292 ADF Statistic: -2.2496766323220068 p-value: 0.18871193673201914 ADF Statistic after Differencing: -18.551387651009644 p-value after Differencing: 2.092251140994728e-30
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq)
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -351.203
Date: Fri, 07 Mar 2025 AIC 710.406
Time: 18:23:44 BIC 724.476
Sample: 0 HQIC 716.069
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.8376 0.090 9.351 0.000 0.662 1.013
ma.L1 -1.0342 0.103 -10.040 0.000 -1.236 -0.832
ma.L2 0.0923 0.064 1.442 0.149 -0.033 0.218
sigma2 0.9821 0.043 22.628 0.000 0.897 1.067
===================================================================================
Ljung-Box (L1) (Q): 0.05 Jarque-Bera (JB): 1075.00
Prob(Q): 0.82 Prob(JB): 0.00
Heteroskedasticity (H): 0.29 Skew: -1.29
Prob(H) (two-sided): 0.00 Kurtosis: 12.85
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
ADF Statistic: -1.5745421006952112 p-value: 0.4963679655554053 ADF Statistic after Differencing: -6.294550627148641 p-value after Differencing: 3.5319865896368846e-08
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -324.763
Date: Fri, 07 Mar 2025 AIC 657.525
Time: 18:23:45 BIC 671.595
Sample: 0 HQIC 663.188
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.7320 0.176 4.151 0.000 0.386 1.078
ma.L1 -0.8161 0.185 -4.412 0.000 -1.179 -0.454
ma.L2 -0.0240 0.077 -0.311 0.756 -0.175 0.127
sigma2 0.7948 0.060 13.184 0.000 0.677 0.913
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 34.78
Prob(Q): 0.96 Prob(JB): 0.00
Heteroskedasticity (H): 0.60 Skew: -0.40
Prob(H) (two-sided): 0.02 Kurtosis: 4.64
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
ADF Statistic: -1.6674665747390252 p-value: 0.44794778085361125 ADF Statistic after Differencing: -4.671961998026451 p-value after Differencing: 9.497316833766148e-05
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -1034.789
Date: Fri, 07 Mar 2025 AIC 2077.579
Time: 18:23:46 BIC 2091.648
Sample: 0 HQIC 2083.242
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.0188 0.708 -0.027 0.979 -1.407 1.369
ma.L1 0.0912 0.711 0.128 0.898 -1.302 1.484
ma.L2 -0.0891 0.086 -1.041 0.298 -0.257 0.079
sigma2 238.3134 18.904 12.607 0.000 201.262 275.365
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 3.81
Prob(Q): 0.95 Prob(JB): 0.15
Heteroskedasticity (H): 0.78 Skew: 0.06
Prob(H) (two-sided): 0.27 Kurtosis: 3.59
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ADF Statistic: -2.1315838827076186 p-value: 0.2320056291934623 ADF Statistic after Differencing: -6.2095890644999026 p-value after Differencing: 5.549472251241006e-08
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq)
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood 382.552
Date: Fri, 07 Mar 2025 AIC -757.103
Time: 18:23:48 BIC -743.033
Sample: 0 HQIC -751.440
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.8704 0.025 -35.316 0.000 -0.919 -0.822
ma.L1 0.5165 0.057 9.022 0.000 0.404 0.629
ma.L2 -0.4692 0.031 -15.110 0.000 -0.530 -0.408
sigma2 0.0027 0.000 19.474 0.000 0.002 0.003
===================================================================================
Ljung-Box (L1) (Q): 0.10 Jarque-Bera (JB): 3030.84
Prob(Q): 0.75 Prob(JB): 0.00
Heteroskedasticity (H): 0.03 Skew: -1.46
Prob(H) (two-sided): 0.00 Kurtosis: 19.84
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
ADF Statistic: -1.3714694524933961 p-value: 0.5958750433232802 ADF Statistic after Differencing: -12.217911691427327 p-value after Differencing: 1.1245245775203403e-22
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq)
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood 300.303
Date: Fri, 07 Mar 2025 AIC -592.606
Time: 18:23:49 BIC -578.536
Sample: 0 HQIC -586.942
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.0882 0.564 0.156 0.876 -1.017 1.193
ma.L1 -0.0443 0.563 -0.079 0.937 -1.147 1.059
ma.L2 -0.1112 0.066 -1.678 0.093 -0.241 0.019
sigma2 0.0052 0.000 13.452 0.000 0.004 0.006
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 22.28
Prob(Q): 1.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.77 Skew: -0.49
Prob(H) (two-sided): 0.23 Kurtosis: 4.09
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
ADF Statistic: -2.1842216820409472 p-value: 0.2120218324904929 ADF Statistic after Differencing: -11.058379271011715 p-value after Differencing: 4.879328137171754e-20
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index( /usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:837: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception. return get_prediction_index(
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 250
Model: ARIMA(1, 1, 2) Log Likelihood -358.166
Date: Fri, 07 Mar 2025 AIC 724.331
Time: 18:23:50 BIC 738.401
Sample: 0 HQIC 729.995
- 250
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.8895 0.083 -10.746 0.000 -1.052 -0.727
ma.L1 0.7443 0.105 7.075 0.000 0.538 0.951
ma.L2 -0.0087 0.064 -0.137 0.891 -0.134 0.117
sigma2 1.0389 0.068 15.372 0.000 0.906 1.171
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 41.40
Prob(Q): 0.97 Prob(JB): 0.00
Heteroskedasticity (H): 0.58 Skew: 0.25
Prob(H) (two-sided): 0.01 Kurtosis: 4.93
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer # Import SimpleImputer
# Load the dataset
for file in files:
data = dataframes[file]
X = data[['Open','High','Low','Volume','Log Return','Simple Return','Volatility']] # Replace
y = data['Close'] # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create an imputer to fill NaN values with the mean
imputer = SimpleImputer(strategy='mean')
# Fit the imputer on the training data and transform both training and testing data
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
# Create a linear regression model
model = LinearRegression()
# Fit the model to the training data
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
# Plotting the results
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2) # Diagonal line
plt.xlabel('Actual Close Prices')
plt.ylabel('Predicted Close Prices')
plt.title('Actual vs Predicted Close Prices')
plt.show()
Mean Squared Error: 0.1813687566677913 R-squared: 0.9961555540816833
Mean Squared Error: 0.11533009611589407 R-squared: 0.9963099463726139
Mean Squared Error: 0.0017224296658761961 R-squared: 0.9883147020108428
Mean Squared Error: 0.0002008577228944818 R-squared: 0.9957808710203938
Mean Squared Error: 0.00018706988982735527 R-squared: 0.9984121086165332
Mean Squared Error: 0.1425501471625193 R-squared: 0.9916387603632241
Mean Squared Error: 2.9693765885142356 R-squared: 0.9962178416914171
Mean Squared Error: 0.5487861761102647 R-squared: 0.9931862472403402
Mean Squared Error: 0.23021411432534136 R-squared: 0.9888584299259457
Mean Squared Error: 0.4346677578903055 R-squared: 0.9937763746499695
Mean Squared Error: 0.022405156468870823 R-squared: 0.9934826699000399
Mean Squared Error: 0.05799087080325329 R-squared: 0.9954513356239653
Mean Squared Error: 0.048691398172960865 R-squared: 0.9962854214055153
Mean Squared Error: 26.199643107684075 R-squared: 0.9910209166852437
Mean Squared Error: 0.00023841164018041864 R-squared: 0.9974033065559662
Mean Squared Error: 0.00041059050378594803 R-squared: 0.9978837480393812
Mean Squared Error: 0.11066308656023366 R-squared: 0.9859075069976125
In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Load the dataset
for file in files:
data = dataframes[file] # Replace with your file path
# Prepare the data
# Select features and target variable
# Ensure features are numeric and drop rows with non-numeric values
X = data[['Open', 'High', 'Low', 'Volume', 'Simple Return', 'Log Return', 'Volatility']].apply(pd.to_numeric, errors='coerce').dropna()
y = data['Close'].apply(pd.to_numeric, errors='coerce').dropna() # Ensure target is numeric
# Reindex y to align with X after dropping rows
y = y.reindex(X.index)
# Check if there is enough data after dropping rows
if len(X) < 2 or len(y) < 2:
print(f"Skipping {file} due to insufficient data after cleaning.")
continue # Skip to the next file
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit the model to the training data
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Results for {file}:')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
# Plotting the results
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2) # Diagonal line
plt.xlabel('Actual Close Prices')
plt.ylabel('Predicted Close Prices')
plt.title(f'Actual vs Predicted Close Prices using Random Forest for {file}')
plt.show()
Results for Crude_Oil_2023.csv: Mean Squared Error: 0.5802596279319124 R-squared: 0.984265298729512
Results for Brent_Oil_2023.csv: Mean Squared Error: 0.5201383695127169 R-squared: 0.9830945628333059
Results for Natural_Gas_2023.csv: Mean Squared Error: 0.0016168512747690238 R-squared: 0.9765213310751648
Results for RBOB_Gasoline_2023.csv: Mean Squared Error: 0.0005004918138169276 R-squared: 0.9905118249777439
Results for Heating_Oil_2023.csv: Mean Squared Error: 0.0011989067820372468 R-squared: 0.986460754483685
Results for Energy_SPDR_ETF_2023.csv: Mean Squared Error: 0.2751918315766939 R-squared: 0.983213325221418
Results for VanEck_Oil_ETF_2023.csv: Mean Squared Error: 9.851069227803611 R-squared: 0.987462201637468
Results for SPDR_S&P_Exploration_ETF_2023.csv: Mean Squared Error: 1.323724018462013 R-squared: 0.9857080884015251
Results for ExxonMobil_2023.csv: Mean Squared Error: 0.3937363769141254 R-squared: 0.9808345374442137
Results for Chevron_2023.csv: Mean Squared Error: 0.6016592819174916 R-squared: 0.986821507014054
Results for BPplc_2023.csv: Mean Squared Error: 0.02933753998424803 R-squared: 0.9883533265196377
Results for Shellplc_2023.csv: Mean Squared Error: 0.09466437936425558 R-squared: 0.9894474793594216
Results for TotalEnergies_2023.csv: Mean Squared Error: 0.11317455262623008 R-squared: 0.9915673378027832
Results for Gold_2023.csv: Mean Squared Error: 33.71895283403927 R-squared: 0.9899899708860083
Results for US_13week_Treasury_2023.csv: Mean Squared Error: 0.0004889629950096581 R-squared: 0.9901711821336804
Results for US_10year_Treasury_2023.csv: Mean Squared Error: 0.001340729666759866 R-squared: 0.9912895687142032
Results for Volatility_Index_2023.csv: Mean Squared Error: 0.15357745575949064 R-squared: 0.9838224315727264
In [ ]:
#Monte Carlo Simuation
import pandas as pd
for file in files:
df = dataframes[file]
mean_return = df['Simple Return'].mean()
std_dev_return = df['Simple Return'].std()
num_simulations = 10000
num_days = 252 # Number of trading days to simulate
last_price = df['Close'].iloc[-1]
print(f"Mean Return: {mean_return}")
print(f"Standard Deviation of Returns: {std_dev_return}")
simulated_prices = np.zeros((num_days, num_simulations))
for i in range(num_simulations):
random_returns = np.random.normal(mean_return, std_dev_return, num_days)
price_path = last_price * (1 + random_returns).cumprod()
simulated_prices[:, i] = price_path
plt.figure(figsize=(12, 6))
plt.plot(simulated_prices, color='blue', alpha=0.1)
plt.title('Monte Carlo Simulation of Price Paths')
plt.xlabel('Days')
plt.ylabel('Price')
plt.show()
final_prices = simulated_prices[-1, :]
Mean Return: -6.0272543964092305e-05 Standard Deviation of Returns: 0.021231645734711615
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-23-72b84e02d85d> in <cell line: 0>() 17 simulated_prices[:, i] = price_path 18 plt.figure(figsize=(12, 6)) ---> 19 plt.plot(simulated_prices, color='blue', alpha=0.1) 20 plt.title('Monte Carlo Simulation of Price Paths') 21 plt.xlabel('Days') /usr/local/lib/python3.11/dist-packages/matplotlib/pyplot.py in plot(scalex, scaley, data, *args, **kwargs) 3827 **kwargs, 3828 ) -> list[Line2D]: -> 3829 return gca().plot( 3830 *args, 3831 scalex=scalex, /usr/local/lib/python3.11/dist-packages/matplotlib/axes/_axes.py in plot(self, scalex, scaley, data, *args, **kwargs) 1777 lines = [*self._get_lines(self, *args, data=data, **kwargs)] 1778 for line in lines: -> 1779 self.add_line(line) 1780 if scalex: 1781 self._request_autoscale_view("x") /usr/local/lib/python3.11/dist-packages/matplotlib/axes/_base.py in add_line(self, line) 2367 self._set_artist_props(line) 2368 if line.get_clip_path() is None: -> 2369 line.set_clip_path(self.patch) 2370 2371 self._update_line_limits(line) /usr/local/lib/python3.11/dist-packages/matplotlib/artist.py in set_clip_path(self, path, transform) 813 if isinstance(path, Rectangle): 814 self.clipbox = TransformedBbox(Bbox.unit(), --> 815 path.get_transform()) 816 self._clippath = None 817 success = True /usr/local/lib/python3.11/dist-packages/matplotlib/patches.py in get_transform(self) 307 def get_transform(self): 308 """Return the `~.transforms.Transform` applied to the `Patch`.""" --> 309 return self.get_patch_transform() + artist.Artist.get_transform(self) 310 311 def get_data_transform(self): /usr/local/lib/python3.11/dist-packages/matplotlib/patches.py in get_patch_transform(self) 813 return transforms.BboxTransformTo(bbox) \ 814 + transforms.Affine2D() \ --> 815 .translate(-rotation_point[0], -rotation_point[1]) \ 816 .scale(1, self._aspect_ratio_correction) \ 817 .rotate_deg(self.angle) \ /usr/local/lib/python3.11/dist-packages/matplotlib/transforms.py in translate(self, tx, ty) 2035 self._mtx[0, 2] += tx 2036 self._mtx[1, 2] += ty -> 2037 self.invalidate() 2038 return self 2039 /usr/local/lib/python3.11/dist-packages/matplotlib/transforms.py in invalidate(self) 158 ancestors. Should be called any time the transform changes. 159 """ --> 160 return self._invalidate_internal( 161 level=self._INVALID_AFFINE_ONLY if self.is_affine else self._INVALID_FULL, 162 invalidating_node=self) /usr/local/lib/python3.11/dist-packages/matplotlib/transforms.py in _invalidate_internal(self, level, invalidating_node) 172 return 173 self._invalid = level --> 174 for parent in list(self._parents.values()): 175 parent = parent() # Dereference the weak reference. 176 if parent is not None: KeyboardInterrupt:
In [ ]:
final_prices = simulated_prices[-1, :]
plt.figure(figsize=(12, 6))
plt.hist(final_prices, bins=50, alpha=0.7)
plt.title('Distribution of Final Prices After Simulation')
plt.xlabel('Final Price')
plt.ylabel('Frequency')
plt.show()
In [ ]:
In [ ]: